林嶔 (Lin, Chin)
Lesson 4
The overfitting problem - this limits the total number of parameters a network can have, and thus its complexity
The vanishing gradient problem - this makes networks hard to train, and thus limits their depth
The weight initialization problem - this makes training results unstable, and thus limits the accuracy a network can reach
– Beyond these, before digging into their solutions we will first pick up a bit more basic machine learning and statistics. This knowledge lets us extend the MLP structure we have learned so far in very useful ways, making it far more practical.
\[\hat{y} = f(x)\]
\[ \begin{align} l_1 & = L^1_d(x^E,W^1_d) \\ h_1 & = ReLU(l_1) \\ l_2 & = L^2_1(h_1^E,W^2_1) \\ o & = S(l_2) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
\[ \begin{align} l_1 & = L^1_d(x^E,W^1_d) \\ h_1 & = ReLU(l_1) \\ l_2 & = L^2_m(h_1^E,W^2_m) \end{align} \]
\[ \begin{align} Softmax(x_j) & = \frac{e^{x_j}}{\sum \limits_{i=1}^{m} e^{x_i}} \ \ \ \ \ j \in 1 \ \mbox{to} \ m \end{align} \]
\[ \begin{align} \frac{\partial}{\partial x_i}Softmax(x_j) & = \left\{ \begin{array}{ll} Softmax(x_j)(1 - Softmax(x_i)) & \mbox{ if } i = j \\ Softmax(x_j)(0 - Softmax(x_i)) & \mbox{ otherwise} \end{array} \right. \end{align} \]
– Earlier we designed the cross-entropy loss specifically for logistic regression so that the final optimization would behave well; now we likewise design a dedicated loss function for \(Softmax()\), called the \(m\)-log likelihood function:
– Here \(m\) again denotes the number of classes and \(n\) the number of samples:
\[ \begin{align} mlogloss(y_{i}, p_{i}) & = - \sum \limits_{i=1}^{m} y_{i} log(p_{i}) \end{align} \]
– The partial derivative is listed here (derivation omitted); note that each \(p_i\) appears in only one term of the sum:
\[ \begin{align} \frac{\partial}{\partial p_i}mlogloss(y_{i}, p_{i}) & = - \frac {y_{i}} {p_{i}} \end{align} \]
– Next we compose \(Softmax()\) with \(mlogloss()\); the resulting partial derivative turns out to be remarkably clean (the final step uses the fact that \(\sum_{i=1}^{m} y_i = 1\) for a one-hot label):
\[ \begin{align} p_i &= Softmax(x_i)\\ loss &= mlogloss(y_{i}, p_{i}) \\\\ \frac{\partial}{\partial x_j}loss & = \sum \limits_{i=1}^{m} \frac{\partial loss}{\partial p_i} \frac{\partial p_i}{\partial x_j} \\ & = - y_{j} \frac {1} {p_{j}} p_{j} ( 1-p_{j})-\sum \limits_{i\neq j} y_{i} \frac {1} {p_{i}} p_{i} (0 - p_{j}) \\ & = - y_{j} ( 1-p_{j})-\sum \limits_{i\neq j} y_{i} (0 - p_{j}) \\ & = - y_{j} +y_{j}p_{j}+\sum \limits_{i\neq j} y_{i} p_{j} \\ & = - y_{j} +\sum \limits_{i = 1}^{m} y_{i} p_{j} \\ & = p_{j} - y_{j} \end{align} \]
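– A quick numerical check of this result (a small sketch; the score vector and one-hot label below are arbitrary examples):
softmax = function (x) {exp(x)/sum(exp(x))}
scores = c(1.2, -0.3, 0.5)    # arbitrary inputs for m = 3 classes
label = c(0, 1, 0)            # one-hot y
p = softmax(scores)
mloss = function (x) {-sum(label * log(softmax(x)))}
num_grad = sapply(1:3, function (j) {
  e = rep(0, 3); e[j] = 1e-6
  (mloss(scores + e) - mloss(scores - e))/(2e-6)   # central-difference approximation
})
round(cbind(numerical = num_grad, analytic = p - label), 6)   # the two columns agree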
– The dataset contains 150 samples from three species of the genus Iris: Iris setosa, Iris versicolor, and Iris virginica. Four features were measured on each sample: the length and the width of the sepal and of the petal.
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10), batch_size = 30,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', eps = 1e-8,
x1 = x1, x2 = x2, y = y,
test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
#Functions
#Forward
S.fun = function (x, eps = eps) {
S = 1/(1 + exp(-x))
S[S < eps] = eps
S[S > 1 - eps] = 1 - eps
return(S)
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
CE.fun = function (o, y, eps = eps) {
loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
return(loss)
}
#Backward
grad_o.fun = function (o, y) {
return((o - y)/(o*(1-o)))
}
grad_s.fun = function (grad_o, o) {
return(grad_o*(o*(1-o)))
}
grad_W.fun = function (grad_l, h) {
h.E = cbind(1, h)
return(t(h.E) %*% grad_l/nrow(h))
}
grad_h.fun = function (grad_l, W) {
return(grad_l %*% t(W[-1,]))
}
grad_l.fun = function (grad_h, l) {
de_l = l
de_l[de_l<0] = 0
de_l[de_l>0] = 1
return(grad_h*de_l)
}
#initialization
X_matrix = cbind(x1, x2)
Y_matrix = t(t(y))
W_list = list()
M_list = list()
N_list = list()
len_h = length(num.hidden)
for (w_seq in 1:(len_h+1)) {
if (w_seq == 1) {
NROW_W = ncol(X_matrix) + 1
NCOL_W = num.hidden[w_seq]
} else if (w_seq == len_h+1) {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = ncol(Y_matrix)
} else {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = num.hidden[w_seq]
}
W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
}
loss_seq = rep(0, num.iteration)
#Calculating
for (i in 1:num.iteration) {
idx = sample(1:length(x1), batch_size)
#Forward
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = X_matrix[idx,], W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
loss_seq[i] = CE.fun(o = current_o, y = y[idx], eps = eps)
#Backward
current_grad_l_list = list()
current_grad_W_list = list()
current_grad_h_list = list()
current_grad_o = grad_o.fun(o = current_o, y = y[idx])
current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
for (j in len_h:1) {
current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
if (j != 1) {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
} else {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = X_matrix[idx,])
}
}
if (optimizer == 'adam') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
M.hat = M_list[[j]]/(1 - beta1^i)
N.hat = N_list[[j]]/(1 - beta2^i)
W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
}
} else if (optimizer == 'sgd') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
W_list[[j]] = W_list[[j]] - M_list[[j]]
}
} else {
stop('optimizer must be selected from "sgd" or "adam".')
}
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
}
DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10), batch_size = 30,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', eps = 1e-8,
X_matrix = X, Y_matrix = Y) {
#Functions
#Forward
Softmax.fun = function (x, eps = eps) {
out <- apply(x, 1, function (i) {exp(i)/sum(exp(i))})
return(t(out))
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
Mlogloss.fun = function (o, y, eps = eps) {
loss = -1/nrow(y) * sum(y * log(o + eps))
return(loss)
}
#Backward
grad_o_s.fun = function (o, y) {
return(o - y)
}
grad_W.fun = function (grad_l, h) {
h.E = cbind(1, h)
return(t(h.E) %*% grad_l/nrow(h))
}
grad_h.fun = function (grad_l, W) {
return(grad_l %*% t(W[-1,]))
}
grad_l.fun = function (grad_h, l) {
de_l = l
de_l[de_l<0] = 0
de_l[de_l>0] = 1
return(grad_h*de_l)
}
#initialization
W_list = list()
M_list = list()
N_list = list()
len_h = length(num.hidden)
for (w_seq in 1:(len_h+1)) {
if (w_seq == 1) {
NROW_W = ncol(X_matrix) + 1
NCOL_W = num.hidden[w_seq]
} else if (w_seq == len_h+1) {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = ncol(Y_matrix)
} else {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = num.hidden[w_seq]
}
W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
}
loss_seq = rep(0, num.iteration)
#Calculating
for (i in 1:num.iteration) {
idx = sample(1:nrow(X_matrix), batch_size)
#Forward
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = X_matrix[idx,], W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = Softmax.fun(x = current_l_list[[len_h+1]], eps = eps)
loss_seq[i] = Mlogloss.fun(o = current_o, y = Y_matrix[idx,], eps = eps)
#Backward
current_grad_l_list = list()
current_grad_W_list = list()
current_grad_h_list = list()
current_grad_l_list[[len_h+1]] = grad_o_s.fun(o = current_o, y = Y_matrix[idx,])
current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
for (j in len_h:1) {
current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
if (j != 1) {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
} else {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = X_matrix[idx,])
}
}
if (optimizer == 'adam') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
M.hat = M_list[[j]]/(1 - beta1^i)
N.hat = N_list[[j]]/(1 - beta2^i)
W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
}
} else if (optimizer == 'sgd') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
W_list[[j]] = W_list[[j]] - M_list[[j]]
}
} else {
stop('optimizer must be selected from "sgd" or "adam".')
}
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
return(W_list)
}
PRED_MLP = function (X_matrix, W_list, eps = 1e-8) {
#Functions
#Forward
Softmax.fun = function (x, eps = eps) {
out <- apply(x, 1, function (i) {exp(i)/sum(exp(i))})
return(t(out))
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
len_h = length(W_list) - 1
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = X_matrix, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = Softmax.fun(x = current_l_list[[len_h+1]], eps = eps)
return(current_o)
}
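– The training and test split below assumes a numeric feature matrix X and a one-hot label matrix Y built from iris; a minimal sketch of one way to construct them (the names X and Y simply follow the calls that use them):
data(iris)
X = as.matrix(iris[, 1:4])                    # 150 x 4 numeric feature matrix
Y = model.matrix(~ Species - 1, data = iris)  # 150 x 3 one-hot indicator matrix
colnames(Y) = levels(iris$Species)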
set.seed(0)
TRAIN.seq = sample(1:150, 100)
TRAIN.X = X[TRAIN.seq,]
TRAIN.Y = Y[TRAIN.seq,]
TEST.X = X[-TRAIN.seq,]
TEST.Y = Y[-TRAIN.seq,]
W_LIST = DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10), batch_size = 30,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', eps = 1e-8,
X_matrix = TRAIN.X, Y_matrix = TRAIN.Y)
PRED_Y = PRED_MLP(X_matrix = TEST.X, W_list = W_LIST, eps = 1e-8)
table(max.col(PRED_Y), max.col(TEST.Y))
##
## 1 2 3
## 1 18 0 0
## 2 0 13 2
## 3 0 2 15
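– The diagonal of this cross table counts the correct predictions, so the overall test accuracy is \(\frac{18+13+15}{50} = 92\%\); in code (reusing the objects above):
test_tab = table(max.col(PRED_Y), max.col(TEST.Y))
sum(diag(test_tab))/sum(test_tab)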
– Seen this way, adding variation is quite important, and in the experiments we also found that it helps test-set accuracy. The question now is whether we can add even more variation to push model accuracy further.
– This family of techniques is collectively called data augmentation; its variants could fill an entire semester on their own. Here we just try a small taste (see the sketch below); later lessons touch on data augmentation repeatedly and we will keep adding to it!
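– Concretely, the trainer below simply adds Gaussian noise to each sampled mini-batch before the forward pass; stripped to its core the idea is just (a toy sketch, with sd playing the role of the sd_Augmentation argument):
batch_X = matrix(rnorm(10), nrow = 5, ncol = 2)          # a toy 5 x 2 mini-batch
aug_X = batch_X + rnorm(prod(dim(batch_X)), sd = 0.3)    # jittered copy used for training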
DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10), batch_size = 30,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', eps = 1e-8,
sd_Augmentation = 0,
x1 = x1, x2 = x2, y = y,
test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
#Functions
#Forward
S.fun = function (x, eps = eps) {
S = 1/(1 + exp(-x))
S[S < eps] = eps
S[S > 1 - eps] = 1 - eps
return(S)
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
CE.fun = function (o, y, eps = eps) {
loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
return(loss)
}
#Backward
grad_o.fun = function (o, y) {
return((o - y)/(o*(1-o)))
}
grad_s.fun = function (grad_o, o) {
return(grad_o*(o*(1-o)))
}
grad_W.fun = function (grad_l, h) {
h.E = cbind(1, h)
return(t(h.E) %*% grad_l/nrow(h))
}
grad_h.fun = function (grad_l, W) {
return(grad_l %*% t(W[-1,]))
}
grad_l.fun = function (grad_h, l) {
de_l = l
de_l[de_l<0] = 0
de_l[de_l>0] = 1
return(grad_h*de_l)
}
#initialization
X_matrix = cbind(x1, x2)
Y_matrix = t(t(y))
W_list = list()
M_list = list()
N_list = list()
len_h = length(num.hidden)
for (w_seq in 1:(len_h+1)) {
if (w_seq == 1) {
NROW_W = ncol(X_matrix) + 1
NCOL_W = num.hidden[w_seq]
} else if (w_seq == len_h+1) {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = ncol(Y_matrix)
} else {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = num.hidden[w_seq]
}
W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
}
loss_seq = rep(0, num.iteration)
#Calculating
for (i in 1:num.iteration) {
idx = sample(1:length(x1), batch_size)
#Augmentation
New_X_matrix = X_matrix[idx,]
New_X_matrix = New_X_matrix + rnorm(prod(dim(New_X_matrix)), sd = sd_Augmentation)
New_y = y[idx]
#Forward
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = New_X_matrix, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
loss_seq[i] = CE.fun(o = current_o, y = New_y, eps = eps)
#Backward
current_grad_l_list = list()
current_grad_W_list = list()
current_grad_h_list = list()
current_grad_o = grad_o.fun(o = current_o, y = y[idx])
current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
for (j in len_h:1) {
current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
if (j != 1) {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
} else {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = New_X_matrix) # use the augmented batch that produced the forward pass
}
}
if (optimizer == 'adam') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
M.hat = M_list[[j]]/(1 - beta1^i)
N.hat = N_list[[j]]/(1 - beta2^i)
W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
}
} else if (optimizer == 'sgd') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
W_list[[j]] = W_list[[j]] - M_list[[j]]
}
} else {
stop('optimizer must be selected from "sgd" or "adam".')
}
}
require(scales)
require(plot3D)
x1_seq = seq(min(x1), max(x1), length.out = 100)
x2_seq = seq(min(x2), max(x2), length.out = 100)
pre_func = function (x1, x2) {
new_X = cbind(x1, x2)
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
return(current_o)
}
pred_y = pre_func(x1 = x1, x2 = x2)
MAIN_TXT = paste0('Train-Acc:', formatC(mean((pred_y > 0.5) == y), 2, format = 'f'))
if (!is.null(test_x1)) {
pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
MAIN_TXT = paste0(MAIN_TXT, '; Test-Acc:', formatC(mean((pred_test_y > 0.5) == test_y), 2, format = 'f'))
}
z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
par(mfrow = c(1, 2))
image2D(z = z_matrix, main = MAIN_TXT,
x = x1_seq, xlab = 'x1',
y = x2_seq, ylab = 'x2',
shade = 0.2, rasterImage = TRUE,
col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
if (!is.null(test_x1)) {
points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
return(list(Train_tab = table((pred_y > 0.5) + 0L, y), Test_tab = table((pred_test_y > 0.5) + 0L, test_y)))
}
set.seed(0)
x1 = rnorm(100, sd = 1)
x2 = rnorm(100, sd = 1)
lr1 = - 1.5 + x1^2 + x2^2 + rnorm(100)
y = (lr1 > 0) + 0L
test_x1 = rnorm(100, sd = 1)
test_x2 = rnorm(100, sd = 1)
lr1 = - 1.5 + test_x1^2 + test_x2^2 + rnorm(100)
test_y = (lr1 > 0) + 0L
– Let's compare the results without data augmentation (sd_Augmentation = 0) and with it (sd_Augmentation = 0.3/0.8):
TABLE_LIST = DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(100), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
sd_Augmentation = 0,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
TABLE_LIST = DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(100), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
sd_Augmentation = 0.3,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
TABLE_LIST = DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(100), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
sd_Augmentation = 0.8,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– Now let's take on a problem with extreme class imbalance and see where the trouble lies:
set.seed(0)
x1 = rnorm(200, sd = 1)
x2 = rnorm(200, sd = 1)
lr1 = - 1.5 + x1^2 + x2^2 + x1*x2 + rnorm(100)
y = (lr1 > 5 | lr1 < -3) + 0L
test_x1 = rnorm(200, sd = 1)
test_x2 = rnorm(200, sd = 1)
lr1 = - 1.5 + test_x1^2 + test_x2^2 + test_x1*test_x2 + rnorm(100)
test_y = (lr1 > 5 | lr1 < -3) + 0L
– Let's look at the classification results:
TABLE_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = c(100), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
sd_Augmentation = 0.3,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– Looks quite accurate, doesn't it? But let's look at the cross table:
## $Train_tab
## y
## FALSE TRUE
## FALSE 179 12
## TRUE 1 8
##
## $Test_tab
## test_y
## FALSE TRUE
## FALSE 187 10
## TRUE 0 3
– For severely imbalanced samples we therefore usually report the sensitivity/specificity pair instead. Taking this result as an example, the test-set sensitivity is \(\frac{3}{13}=23.1 \%\), the specificity is \(\frac{187}{187}=100 \%\), and the balanced accuracy is therefore \(\frac{23.1 \% + 100 \%}{2} = 61.5 \%\).
– So our goal is to come up with a data-augmentation approach that raises balanced accuracy as much as possible, and to change the plot output so that it reports balanced accuracy!
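– Balanced accuracy is easy to compute from a 2 x 2 cross table laid out as above (rows = prediction, columns = truth); a minimal helper sketch, checked against the numbers just quoted:
balanced_acc = function (tab) {
  spec = tab[1, 1]/sum(tab[, 1])   # specificity: correctly predicted negatives / all true negatives
  sens = tab[2, 2]/sum(tab[, 2])   # sensitivity: correctly predicted positives / all true positives
  (sens + spec)/2
}
balanced_acc(matrix(c(187, 0, 10, 3), nrow = 2))   # the test table above: 0.615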
DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10), batch_size = 30,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', eps = 1e-8,
sd_Augmentation = 0, oversampling = TRUE,
x1 = x1, x2 = x2, y = y,
test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
#Functions
#Forward
S.fun = function (x, eps = eps) {
S = 1/(1 + exp(-x))
S[S < eps] = eps
S[S > 1 - eps] = 1 - eps
return(S)
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
CE.fun = function (o, y, eps = eps) {
loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
return(loss)
}
#Backward
grad_o.fun = function (o, y) {
return((o - y)/(o*(1-o)))
}
grad_s.fun = function (grad_o, o) {
return(grad_o*(o*(1-o)))
}
grad_W.fun = function (grad_l, h) {
h.E = cbind(1, h)
return(t(h.E) %*% grad_l/nrow(h))
}
grad_h.fun = function (grad_l, W) {
return(grad_l %*% t(W[-1,]))
}
grad_l.fun = function (grad_h, l) {
de_l = l
de_l[de_l<0] = 0
de_l[de_l>0] = 1
return(grad_h*de_l)
}
#initialization
X_matrix = cbind(x1, x2)
Y_matrix = t(t(y))
W_list = list()
M_list = list()
N_list = list()
len_h = length(num.hidden)
for (w_seq in 1:(len_h+1)) {
if (w_seq == 1) {
NROW_W = ncol(X_matrix) + 1
NCOL_W = num.hidden[w_seq]
} else if (w_seq == len_h+1) {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = ncol(Y_matrix)
} else {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = num.hidden[w_seq]
}
W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
}
loss_seq = rep(0, num.iteration)
#Calculating
for (i in 1:num.iteration) {
#Oversampling
if (oversampling) {
idx_pos = sample(which(y == 1), batch_size/2, replace = TRUE)
idx_neg = sample(which(y == 0), batch_size/2, replace = TRUE)
idx = c(idx_pos, idx_neg)
} else {
idx = sample(1:length(x1), batch_size, replace = TRUE)
}
#Augmentation
New_X_matrix = X_matrix[idx,]
New_X_matrix = New_X_matrix + rnorm(prod(dim(New_X_matrix)), sd = sd_Augmentation)
New_y = y[idx]
#Forward
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = New_X_matrix, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
loss_seq[i] = CE.fun(o = current_o, y = New_y, eps = eps)
#Backward
current_grad_l_list = list()
current_grad_W_list = list()
current_grad_h_list = list()
current_grad_o = grad_o.fun(o = current_o, y = y[idx])
current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
for (j in len_h:1) {
current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
if (j != 1) {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
} else {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = New_X_matrix) # use the augmented batch that produced the forward pass
}
}
if (optimizer == 'adam') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
M.hat = M_list[[j]]/(1 - beta1^i)
N.hat = N_list[[j]]/(1 - beta2^i)
W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
}
} else if (optimizer == 'sgd') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
W_list[[j]] = W_list[[j]] - M_list[[j]]
}
} else {
stop('optimizer must be selected from "sgd" or "adam".')
}
}
require(scales)
require(plot3D)
x1_seq = seq(min(x1), max(x1), length.out = 100)
x2_seq = seq(min(x2), max(x2), length.out = 100)
pre_func = function (x1, x2) {
new_X = cbind(x1, x2)
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
return(current_o)
}
pred_y = pre_func(x1 = x1, x2 = x2)
train_table = table(factor(pred_y > 0.5, levels = c(FALSE, TRUE)), factor(y, levels = c(FALSE, TRUE)))
MAIN_TXT = paste0('Train-BA:', formatC((train_table[1,1]/sum(train_table[,1]) + train_table[2,2]/sum(train_table[,2]))/2, 2, format = 'f'))
if (!is.null(test_x1)) {
pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
test_table = table(factor(pred_test_y > 0.5, levels = c(FALSE, TRUE)), factor(test_y, levels = c(FALSE, TRUE)))
MAIN_TXT = paste0(MAIN_TXT, '; Test-BA:', formatC((test_table[1,1]/sum(test_table[,1]) + test_table[2,2]/sum(test_table[,2]))/2, 2, format = 'f'))
}
z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
par(mfrow = c(1, 2))
image2D(z = z_matrix, main = MAIN_TXT,
x = x1_seq, xlab = 'x1',
y = x2_seq, ylab = 'x2',
shade = 0.2, rasterImage = TRUE,
col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
if (!is.null(test_x1)) {
points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
return(list(pred_y = pred_y, pred_test_y = pred_test_y))
}
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = c(100), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999,
optimizer = 'sgd',
sd_Augmentation = 0.3, oversampling = FALSE,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = c(100), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999,
optimizer = 'sgd',
sd_Augmentation = 0.3, oversampling = TRUE,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
table(factor(PRED_LIST$pred_test_y > 0.5, levels = c(FALSE, TRUE)), factor(test_y, levels = c(FALSE, TRUE)))
##
## FALSE TRUE
## FALSE 154 4
## TRUE 33 9
– In fact, you can use the ROC curve to decide where the best cutpoint lies:
library(PRROC)
ROC = roc.curve(PRED_LIST$pred_test_y[test_y == 1], PRED_LIST$pred_test_y[test_y == 0], curve = TRUE)
plot(ROC)
table(factor(PRED_LIST$pred_test_y > 0.625, levels = c(FALSE, TRUE)), factor(test_y, levels = c(FALSE, TRUE)))
##
## FALSE TRUE
## FALSE 179 5
## TRUE 8 8
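– The 0.625 cutpoint above was read off the plotted curve; a programmatic alternative is the Youden index. This sketch assumes that the three columns of ROC$curve returned by PRROC hold the false positive rate, the sensitivity, and the corresponding threshold:
youden = ROC$curve[, 2] - ROC$curve[, 1]       # sensitivity minus false positive rate at each threshold
best_cut = ROC$curve[which.max(youden), 3]     # threshold with the largest Youden index
best_cut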
– AlexNet, which as mentioned in Lesson 2 swept the 2012 ILSVRC championship, describes in its paper ImageNet Classification with Deep Convolutional Neural Networks a heavy use of data augmentation on the image data; this is one of the reasons deep learning's accuracy pulled so far ahead of traditional methods.
– Although we have not touched image data yet, we can still roughly follow the tutorial to see how it is used. This package lets us conveniently apply many image preprocessing steps, including cropping, rotation, scaling, grayscale conversion, ZCA whitening, and so on, all of which will make data augmentation easy when we move on to image recognition.
– In statistical problems there is a family of methods that relies on regularization, mainly to handle the "many variables, few cases" situation; this is essentially the same difficulty as the "too many weights to solve for" problem we ran into with the MLP whose hidden layer has 100 neurons.
L1 Regularization - a representative technique is LASSO regression
L2 Regularization - a representative technique is ridge regression
That was a pile of statistical jargon that makes things sound rather mystical (quite effective if you want to bluff someone or show off), but written out as mathematics it is very simple: it just modifies the loss function.
Taking linear regression as an example, let's look at what the standard prediction function and loss function look like:
\[ \begin{align} \hat{y_i} & = b_{0} + b_{1}x_i \\ loss & = \frac{{1}}{2n}\sum \limits_{i=1}^{n} \left(y_{i} - \hat{y_i}\right)^{2} \end{align} \]
– L1 regularization simply amends the \(loss\) term: (here \(\lambda\) is a penalty weight) \[ \begin{align} loss & = \frac{{1}}{2n}\sum \limits_{i=1}^{n} \left(y_{i} - \hat{y_i}\right)^{2} + \lambda ( |b_0| + |b_1|) \end{align} \]
– L2 regularization is very similar to L1 regularization; only the form of the correction to \(loss\) differs:
\[ \begin{align} loss & = \frac{{1}}{2n}\sum \limits_{i=1}^{n} \left(y_{i} - \hat{y_i}\right)^{2} + \frac{\lambda}{2} ( b_0^2 + b_1^2) \end{align} \]
– Consider its physical meaning: a weight of 0 means that input does not matter, so the corresponding variable is dispensable. Using regularization in an MLP therefore drives many of the connection weights between neurons to 0 (meaning no connection), which can be viewed as a kind of sparse connectivity. Doesn't that look even more like the structure of the brain?
– Combining the prediction function and the loss function
\[loss = \frac{{1}}{2n}\sum \limits_{i=1}^{n} \left(y_{i} - \left(b_{0} + b_{1}x_{i}\right)\right)^{2} + \frac{\lambda}{2} ( b_0^2 + b_1^2)\]
– The partial derivatives with respect to \(b_0\) and \(b_1\) (derivation omitted)
\[ \begin{align} \frac{\partial}{\partial b_0}loss & = - \frac{{1}}{n} \sum \limits_{i=1}^{n}\left(y_{i} - \left(b_{0} + b_{1}x_{i}\right)\right) + \lambda b_0\\ \frac{\partial}{\partial b_1}loss & = - \frac{{1}}{n} \sum \limits_{i=1}^{n}\left(y_{i} - \left(b_{0} + b_{1}x_{i}\right)\right)\cdot x_{i} + \lambda b_1 \end{align} \]
– Clearly, when \(\lambda\) is large, every weight will find it very hard to move away from 0.
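– To make this concrete, here is a minimal gradient-descent sketch for this L2-penalized linear regression using the derivatives above (the simulated data and the value of lambda are purely illustrative):
set.seed(1)
reg_x = rnorm(100)
reg_y = 2 + 3 * reg_x + rnorm(100)
lambda = 0.5
lr = 0.1
b0 = 0
b1 = 0
for (i in 1:2000) {
  res = reg_y - (b0 + b1 * reg_x)               # residuals under the current estimates
  grad_b0 = - mean(res) + lambda * b0           # partial derivative of the penalized loss w.r.t. b0
  grad_b1 = - mean(res * reg_x) + lambda * b1   # partial derivative w.r.t. b1
  b0 = b0 - lr * grad_b0
  b1 = b1 - lr * grad_b1
}
c(b0 = b0, b1 = b1)   # both coefficients are shrunk toward 0 compared with lm(reg_y ~ reg_x)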
– Here we introduce the notation \(||W||^2\) for the sum of squared elements of the vector (matrix) \(W\).
\[ \begin{align} l_1 & = L^1_d(x^E,W^1_d) \\ h_1 & = ReLU(l_1) \\ l_2 & = L^2_1(h_1^E,W^2_1) \\ o & = S(l_2) \\\\ loss & = CE(y, o) + L2Reg(W) \\ & = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) + \frac{\lambda}{2} ||W^2_1||^2 + \frac{\lambda}{2} ||W^1_d||^2 \end{align} \]
\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.o \otimes \frac{\partial}{\partial l_2}o= o-y \\ grad.W^2_1 & = \frac{\partial}{\partial W^2_1}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2_1}l_2 + \frac{\partial}{\partial W^2_1}\frac{\lambda}{2}||W^2_1||^2 = \frac{{1}}{n} \otimes (h_1^E)^T \bullet grad.l_2 + \lambda W^2_1\\ grad.h_1^E & = \frac{\partial}{\partial h_1^E}loss = grad.l_2 \otimes \frac{\partial}{\partial h_1^E}l_2 = grad.l_2 \bullet (W^2_1)^T \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1_d & = \frac{\partial}{\partial W^1_d}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1_d}l_1 + \frac{\partial}{\partial W^1_d}\frac{\lambda}{2}||W^1_d||^2 = \frac{{1}}{n} \otimes (x^E)^T \bullet grad.l_1 + \lambda W^1_d \end{align} \]
– Note that since the computed gradient is normally multiplied by a learning rate \(lr\), the effective decay rate would become \(lr \times \lambda\). To avoid this inconvenience in practice, we usually pull the decay term out so that it is not multiplied by the learning rate, keeping its influence fixed; the two update rules are sketched below.
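– A minimal sketch contrasting the two update rules (the matrices and rates here are purely illustrative):
# Toy weight matrix and gradient, just to show where the decay term sits
W_toy = matrix(rnorm(6), nrow = 3)
G_toy = matrix(rnorm(6), nrow = 3)
lr = 0.05; lambda = 0.01; wd = 0.001
# Coupled form: the penalty gradient passes through lr, so the effective decay is lr * lambda
W_coupled = W_toy - lr * (G_toy + lambda * W_toy)
# Decoupled form (as in the trainer below): the decay term bypasses lr and its strength stays fixed at wd
W_decoupled = W_toy - lr * G_toy - wd * W_toy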
DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10), batch_size = 30,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', eps = 1e-8,
sd_Augmentation = 0, oversampling = FALSE, wd = 1e-4,
x1 = x1, x2 = x2, y = y,
test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
#Functions
#Forward
S.fun = function (x, eps = eps) {
S = 1/(1 + exp(-x))
S[S < eps] = eps
S[S > 1 - eps] = 1 - eps
return(S)
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
CE.fun = function (o, y, eps = eps) {
loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
return(loss)
}
#Backward
grad_o.fun = function (o, y) {
return((o - y)/(o*(1-o)))
}
grad_s.fun = function (grad_o, o) {
return(grad_o*(o*(1-o)))
}
grad_W.fun = function (grad_l, h) {
h.E = cbind(1, h)
return(t(h.E) %*% grad_l/nrow(h))
}
grad_h.fun = function (grad_l, W) {
return(grad_l %*% t(W[-1,]))
}
grad_l.fun = function (grad_h, l) {
de_l = l
de_l[de_l<0] = 0
de_l[de_l>0] = 1
return(grad_h*de_l)
}
#initialization
X_matrix = cbind(x1, x2)
Y_matrix = t(t(y))
W_list = list()
M_list = list()
N_list = list()
len_h = length(num.hidden)
for (w_seq in 1:(len_h+1)) {
if (w_seq == 1) {
NROW_W = ncol(X_matrix) + 1
NCOL_W = num.hidden[w_seq]
} else if (w_seq == len_h+1) {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = ncol(Y_matrix)
} else {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = num.hidden[w_seq]
}
W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
}
loss_seq = rep(0, num.iteration)
#Calculating
for (i in 1:num.iteration) {
#Oversampling
if (oversampling) {
idx_pos = sample(which(y == 1), batch_size/2, replace = TRUE)
idx_neg = sample(which(y == 0), batch_size/2, replace = TRUE)
idx = c(idx_pos, idx_neg)
} else {
idx = sample(1:length(x1), batch_size, replace = TRUE)
}
#Augmentation
New_X_matrix = X_matrix[idx,]
New_X_matrix = New_X_matrix + rnorm(prod(dim(New_X_matrix)), sd = sd_Augmentation)
New_y = y[idx]
#Forward
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = New_X_matrix, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
#---------------------------------------------------------#
loss_seq[i] = CE.fun(o = current_o, y = New_y, eps = eps) + wd/2*sum(sapply(W_list, function(x) {sum(x^2)}))
#---------------------------------------------------------#
#Backward
current_grad_l_list = list()
current_grad_W_list = list()
current_grad_h_list = list()
current_grad_o = grad_o.fun(o = current_o, y = y[idx])
current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
for (j in len_h:1) {
current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
if (j != 1) {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
} else {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = New_X_matrix) # use the augmented batch that produced the forward pass
}
}
if (optimizer == 'adam') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
M.hat = M_list[[j]]/(1 - beta1^i)
N.hat = N_list[[j]]/(1 - beta2^i)
#---------------------------------------------------------#
W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps) - wd * W_list[[j]]
#---------------------------------------------------------#
}
} else if (optimizer == 'sgd') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
#---------------------------------------------------------#
W_list[[j]] = W_list[[j]] - M_list[[j]] - wd * W_list[[j]]
#---------------------------------------------------------#
}
} else {
stop('optimizer must be selected from "sgd" or "adam".')
}
}
require(scales)
require(plot3D)
x1_seq = seq(min(x1), max(x1), length.out = 100)
x2_seq = seq(min(x2), max(x2), length.out = 100)
pre_func = function (x1, x2) {
new_X = cbind(x1, x2)
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
return(current_o)
}
pred_y = pre_func(x1 = x1, x2 = x2)
train_table = table(factor(pred_y > 0.5, levels = c(FALSE, TRUE)), factor(y, levels = c(FALSE, TRUE)))
MAIN_TXT = paste0('Train-BA:', formatC((train_table[1,1]/sum(train_table[,1]) + train_table[2,2]/sum(train_table[,2]))/2, 2, format = 'f'))
if (!is.null(test_x1)) {
pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
test_table = table(factor(pred_test_y > 0.5, levels = c(FALSE, TRUE)), factor(test_y, levels = c(FALSE, TRUE)))
MAIN_TXT = paste0(MAIN_TXT, '; Test-BA:', formatC((test_table[1,1]/sum(test_table[,1]) + test_table[2,2]/sum(test_table[,2]))/2, 2, format = 'f'))
}
z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
par(mfrow = c(1, 2))
image2D(z = z_matrix, main = MAIN_TXT,
x = x1_seq, xlab = 'x1',
y = x2_seq, ylab = 'x2',
shade = 0.2, rasterImage = TRUE,
col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
if (!is.null(test_x1)) {
points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
return(list(pred_y = pred_y, pred_test_y = pred_test_y))
}
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 100, batch_size = 30,
lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd', eps = 1e-8,
sd_Augmentation = 0, oversampling = FALSE, wd = 0,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 100, batch_size = 30,
lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd', eps = 1e-8,
sd_Augmentation = 0.3, oversampling = FALSE, wd = 0,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 100, batch_size = 30,
lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd', eps = 1e-8,
sd_Augmentation = 0, oversampling = FALSE, wd = 0.001,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 100, batch_size = 30,
lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd', eps = 1e-8,
sd_Augmentation = 0.3, oversampling = FALSE, wd = 0.001,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– Let's go back to 2012 once more. When Alex Krizhevsky, Ilya Sutskever, and Geoffrey Hinton of the University of Toronto won the competition outright with the neural network AlexNet, had everyone else in the world simply been too foolish to think of this approach? In fact, a breakthrough of that magnitude was only possible because several long-standing difficulties were solved at the same time!
– Some context, setting computing power aside and focusing only on the statistical problem: AlexNet has more than 62 million parameters to solve for, while the ImageNet database at the time provided only about 1.2 million examples, so overfitting was extremely serious; even data augmentation and regularization together were not enough to fix it.
– This is where the dropout mechanism comes in: at every exam, half of the students are randomly pulled out and only the remaining students sit the test. Wouldn't that push every student to study harder?
\[ dropout(x) = \left\{ \begin{array}{ll} \frac{x}{1-dp} & \mbox{ otherwise} \\ 0 & \mbox{ if sampled (rate = dp)} \end{array} \right. \]
– Here you will notice that because \(dropout(x)\) is extremely similar to \(ReLU(x)\), their partial derivatives are very similar as well:
\[ \frac{\partial}{\partial x}dropout(x) = \left\{ \begin{array}{ll} \frac{1}{1-dp} & \mbox{ otherwise} \\ 0 & \mbox{ if sampled (rate = dp)} \end{array} \right. \]
\[ \begin{align} l_1 & = L^1_d(x^E,W^1_d) \\ dp_1 & = dropout(l_1) \\ h_1 & = ReLU(dp_1) \\ l_2 & = L^2_1(h_1^E,W^2_1) \\ dp_2 & = dropout(l_2) \\ o & = S(dp_2) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.dp_2 & = \frac{\partial}{\partial dp_2}loss = grad.o \otimes \frac{\partial}{\partial dp_2}o= o-y \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.dp_2 \otimes \frac{\partial}{\partial l_2}dp_2 = (o-y) \otimes \frac{\partial}{\partial l_2}dropout(l_2) \\ grad.W^2_1 & = \frac{\partial}{\partial W^2_1}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2_1}l_2 = \frac{{1}}{n} \otimes (h_1^E)^T \bullet grad.l_2\\ grad.h_1^E & = \frac{\partial}{\partial h_1^E}loss = grad.l_2 \otimes \frac{\partial}{\partial h_1^E}l_2 = grad.l_2 \bullet (W^2_1)^T \\ grad.dp_1 & = \frac{\partial}{\partial dp_1}loss = grad.h_1 \otimes \frac{\partial}{\partial dp_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial dp_1}ReLU(dp_1) \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.dp_1 \otimes \frac{\partial}{\partial l_1}dp_1 = grad.dp_1 \otimes \frac{\partial}{\partial l_1}dropout(l_1) \\ grad.W^1_d & = \frac{\partial}{\partial W^1_d}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1_d}l_1 = \frac{{1}}{n} \otimes (x^E)^T \bullet grad.l_1 \end{align} \]
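– A quick sanity check that the \(\frac{1}{1-dp}\) rescaling keeps the expected activation unchanged (this sketch reuses the same dropout.fun helper that the trainer below defines internally):
# Inverted dropout: zero out a fraction dp of the entries, rescale the survivors by 1/(1-dp)
dropout.fun = function (x, dp = 0.5) {
  len_x = length(x)
  x[sample(1:len_x, len_x * dp)] <- 0
  return(x/(1 - dp))
}
set.seed(0)
mean(dropout.fun(rep(1, 1e5), dp = 0.5))   # stays at 1: half the units are dropped, the survivors are doubled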
DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10), batch_size = 30,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', eps = 1e-8,
sd_Augmentation = 0, oversampling = FALSE, wd = 1e-4, dp = 0.5,
x1 = x1, x2 = x2, y = y,
test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
#Functions
#Forward
S.fun = function (x, eps = eps) {
S = 1/(1 + exp(-x))
S[S < eps] = eps
S[S > 1 - eps] = 1 - eps
return(S)
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
#---------------------------------------------#
dropout.fun = function (x, dp = dp) {
len_x = length(x)
x[sample(1:len_x, len_x * dp)] <- 0
return(x/(1-dp))
}
#---------------------------------------------#
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
CE.fun = function (o, y, eps = eps) {
loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
return(loss)
}
#Backward
grad_o.fun = function (o, y) {
return((o - y)/(o*(1-o)))
}
grad_s.fun = function (grad_o, o) {
return(grad_o*(o*(1-o)))
}
grad_W.fun = function (grad_l, h) {
h.E = cbind(1, h)
return(t(h.E) %*% grad_l/nrow(h))
}
grad_h.fun = function (grad_l, W) {
return(grad_l %*% t(W[-1,]))
}
#---------------------------------------------#
grad_dp.fun = function (grad_h, DP, dp = dp) {
de_DP = DP
de_DP[de_DP<0] = 0
de_DP[de_DP>0] = 1
return(grad_h*de_DP)
}
grad_l.fun = function (grad_dp, DP, dp = dp) {
de_DP = DP
de_DP[de_DP == 0] = 0
de_DP[de_DP != 0] = 1/(1-dp)
return(grad_dp*de_DP)
}
#---------------------------------------------#
#initialization
X_matrix = cbind(x1, x2)
Y_matrix = t(t(y))
W_list = list()
M_list = list()
N_list = list()
len_h = length(num.hidden)
for (w_seq in 1:(len_h+1)) {
if (w_seq == 1) {
NROW_W = ncol(X_matrix) + 1
NCOL_W = num.hidden[w_seq]
} else if (w_seq == len_h+1) {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = ncol(Y_matrix)
} else {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = num.hidden[w_seq]
}
W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
}
loss_seq = rep(0, num.iteration)
#Calculating
for (i in 1:num.iteration) {
#Oversampling
if (oversampling) {
idx_pos = sample(which(y == 1), batch_size/2, replace = TRUE)
idx_neg = sample(which(y == 0), batch_size/2, replace = TRUE)
idx = c(idx_pos, idx_neg)
} else {
idx = sample(1:length(x1), batch_size, replace = TRUE)
}
#Augmentation
New_X_matrix = X_matrix[idx,]
New_X_matrix = New_X_matrix + rnorm(prod(dim(New_X_matrix)), sd = sd_Augmentation)
New_y = y[idx]
#Forward
current_l_list = list()
#---------------------------------------------#
current_dp_list = list()
#---------------------------------------------#
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = New_X_matrix, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
#---------------------------------------------#
current_dp_list[[j]] = dropout.fun(x = current_l_list[[j]], dp = dp)
current_h_list[[j]] = ReLU.fun(x = current_dp_list[[j]])
#---------------------------------------------#
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
#---------------------------------------------#
current_dp_list[[len_h+1]] = dropout.fun(x = current_l_list[[len_h+1]], dp = dp)
current_o = S.fun(x = current_dp_list[[len_h+1]], eps = eps)
#---------------------------------------------#
loss_seq[i] = CE.fun(o = current_o, y = New_y, eps = eps) + wd/2*sum(sapply(W_list, function(x) {sum(x^2)}))
#Backward
current_grad_l_list = list()
#---------------------------------------------#
current_grad_dp_list = list()
#---------------------------------------------#
current_grad_W_list = list()
current_grad_h_list = list()
current_grad_o = grad_o.fun(o = current_o, y = y[idx])
#---------------------------------------------#
current_grad_dp_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
current_grad_l_list[[len_h+1]] = grad_l.fun(grad_dp = current_grad_dp_list[[len_h+1]], DP = current_dp_list[[len_h+1]], dp = dp)
#---------------------------------------------#
current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
for (j in len_h:1) {
current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
#---------------------------------------------#
current_grad_dp_list[[j]] = grad_dp.fun(grad_h = current_grad_h_list[[j]], DP = current_dp_list[[j]], dp = dp)
current_grad_l_list[[j]] = grad_l.fun(grad_dp = current_grad_dp_list[[j]], DP = current_dp_list[[j]], dp = dp)
#---------------------------------------------#
if (j != 1) {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
} else {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = New_X_matrix) # use the augmented batch that produced the forward pass
}
}
if (optimizer == 'adam') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
M.hat = M_list[[j]]/(1 - beta1^i)
N.hat = N_list[[j]]/(1 - beta2^i)
W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps) - wd * W_list[[j]]
}
} else if (optimizer == 'sgd') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
W_list[[j]] = W_list[[j]] - M_list[[j]] - wd * W_list[[j]]
}
} else {
stop('optimizer must be selected from "sgd" or "adam".')
}
}
require(scales)
require(plot3D)
x1_seq = seq(min(x1), max(x1), length.out = 100)
x2_seq = seq(min(x2), max(x2), length.out = 100)
pre_func = function (x1, x2) {
new_X = cbind(x1, x2)
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
return(current_o)
}
pred_y = pre_func(x1 = x1, x2 = x2)
train_table = table(factor(pred_y > 0.5, levels = c(FALSE, TRUE)), factor(y, levels = c(FALSE, TRUE)))
MAIN_TXT = paste0('Train-BA:', formatC((train_table[1,1]/sum(train_table[,1]) + train_table[2,2]/sum(train_table[,2]))/2, 2, format = 'f'))
if (!is.null(test_x1)) {
pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
test_table = table(factor(pred_test_y > 0.5, levels = c(FALSE, TRUE)), factor(test_y, levels = c(FALSE, TRUE)))
MAIN_TXT = paste0(MAIN_TXT, '; Test-BA:', formatC((test_table[1,1]/sum(test_table[,1]) + test_table[2,2]/sum(test_table[,2]))/2, 2, format = 'f'))
}
z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
par(mfrow = c(1, 2))
image2D(z = z_matrix, main = MAIN_TXT,
x = x1_seq, xlab = 'x1',
y = x2_seq, ylab = 'x2',
shade = 0.2, rasterImage = TRUE,
col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
if (!is.null(test_x1)) {
points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
return(list(pred_y = pred_y, pred_test_y = pred_test_y))
}
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 100, batch_size = 30,
lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd', eps = 1e-8,
sd_Augmentation = 0, oversampling = FALSE, wd = 0, dp = 0,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 100, batch_size = 30,
lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd', eps = 1e-8,
sd_Augmentation = 0, oversampling = FALSE, wd = 0.01, dp = 0,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 100, batch_size = 30,
lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd', eps = 1e-8,
sd_Augmentation = 0, oversampling = FALSE, wd = 0, dp = 0.5,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
PRED_LIST = DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 100, batch_size = 30,
lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd', eps = 1e-8,
sd_Augmentation = 0, oversampling = FALSE, wd = 0.001, dp = 0.5,
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– We have also picked up a trick for extending the model: simply swap the final layer for an appropriately designed function and the network can handle all sorts of situations. By the same logic we could swap out the input-layer function and obtain prediction functions for arbitrary inputs/outputs! (Admittedly that is still a bit hard...)
– For this reason, mainstream deep learning development today goes through a "framework/platform": you specify a prediction function built by stacking many differentiable layers, and the framework/platform uses the chain rule to differentiate it for you, obtaining the gradients that optimize the whole network's weights. From the next lessons onward we will therefore develop with such frameworks/platforms.
– By then, if we run into a new function, as long as you can work out its partial derivative you should be able to apply it in your own model.